/* OPTIONS OBS=5000  NOREPLACE ;  */

%include "ASMimplibs.sas";


/** select_vars_all_industries_from_CMF.sas -- 

Selects all tabulated non-AR plants from 2002 and 2007 CMFs.
Determines which variables are imputed based on the item-level edit/impute flags and creates a sample of only "good" 
(i.e., not tabulated, non-AR, not imputed) data.

04/05/2015: Modified to save the gooddata files (with imputes made missing) to
            the allcmf library.  I will use these files to create new CART imputations
            for all manufacturing industries.  

	    Also modified to read in all of the variables that we need to compute plant TFP.

***/


data allinds02 (KEEP= survu_id  firmid tab_f tae tae_f tce_f trt_f ee ee_f 
           cf cf_f cm cm_f ph ph_f ww ww_f sw te te_f tvs tvs_f tib tib_f tie tie_f NAICS_NEW_6);
 set cmf.cmf2002; 
 NAICS_NEW_6 = substr(NAICS_NEW,1,6);
 if ar ne 1 and TABBED="Y";  /* drop Administrative Records (AR=1) and keep only tabulated observations. */                
run;


data allinds07 (KEEP=survu_id  firmid tab_f tae tae_f tce_f trt_f ee ee_f 
           cf cf_f cm cm_f ph ph_f ww ww_f sw te te_f tvs tvs_f tib tib_f tie tie_f NAICS_NEW_6);
 set cmf.cmf2007 ;
 NAICS_NEW_6 = substr(NAICS_NEW,1,6);
 if ar ne 1 and TABBED="Y";
run;

proc freq data=allinds07;
 tables tae_f;
title "TAE edit/impute flags, 2007 Census";
run;


%MACRO getgooddata(year=);

data allinds&year._alldata (keep =survu_id firmid NAICS_NEW_6 tvs cm ee cf ph tae tvs_imp cm_imp cf_imp ee_imp ph_imp tae_imp year
                                            taetvs phtvs eetvs cftvs cmtvs);
 set allinds&year;
           /* TVS, CM: Treat as imputed and make missing if
               TVS_F, CM_F flag is B, H, L, M, S, V, RB, RE, RH, RL, RM, or RS 
             This keeps items with flags set to A,C,R,RA,RC,RG or blank. */

            if TVS_F in (' B',' H',' L',' M',' S',' V','RB','RE','RH','RL','RM','RS') 
            then tvs_imp = 1;
            else tvs_imp = 0;

            if cm_F in (' B',' H',' L',' M',' S',' V','RB','RE','RH','RL','RM','RS')
            then cm_imp = 1;
            else cm_imp = 0;

           /* EE: Treat as imputed and make missing if
               EE_F flag is H,V,RH, or RV  */
              if ee_f in (' H',' V','RH','RV') 
              then ee_imp = 1;
              else ee_imp = 0;


           /* CF: Treat as imputed and make missing if
               CF_F flag is H,V,RH, or RV */
              if cf_f in (' H',' V','RH','RV') 
              then cf_imp = 1;
              else cf_imp = 0;


           /* PH:  Treat as imputed and make missing if
              (1) PH_F is B, H, M, or blank 
           */
              if PH_F in (' B',' H',' M')
              THEN ph_imp = 1;
              else ph_imp = 0;

           /* TAE in 2002:  Treat as imputed and make missing if
               The flags for all 3 capital items are 'K' ("raked") 
             This keeps TAE if one of the flags is C,N,R,RC,RN, or RK. */
             %if &year=02 %then 
             %do; 
               if TAB_F = ' K' and TAE_F=' K' and TCE_F=' K' 
               then tae_imp = 1;
               else tae_imp = 0;
             %end;
             %else %if &year=07 %then
             %do;
               if TAE_F in ( ' B', ' V')  
               then tae_imp = 1;
               else tae_imp = 0;
             %end;

             year = "20&year";

             if tvs>0 then 
             do;
               taetvs=tae/tvs; 
               phtvs=ph/tvs;
               eetvs=ee/tvs;
               cftvs=cf/tvs;
               cmtvs=cm/tvs; 
             end;
             else
             do; 
               taetvs=.;
               phtvs=.; 
               eetvs=.;
               cftvs=.;
               cmtvs=.; 
             end;

        run;

%MEND ;


%getgooddata(year=02);
%getgooddata(year=07);


data alldata;
 set allinds02_alldata allinds07_alldata;
run;

proc sort data=alldata;
 by year NAICS_NEW_6;
run;

%MACRO getindimprate(var=);

 proc means data=alldata mean NOPRINT;
 var &var._imp;
 by year NAICS_NEW_6;
 output out=&var._imp_rates (keep = year NAICS_NEW_6 &var._imp_rate) mean=&var._imp_rate;
 run;

%MEND;

%getindimprate(var=tvs);
%getindimprate(var=cm);
%getindimprate(var=cf);
%getindimprate(var=ee);
%getindimprate(var=ph);
%getindimprate(var=tae);


data imprates;
 merge tvs_imp_rates cm_imp_rates cf_imp_rates ee_imp_rates ph_imp_rates tae_imp_rates;
by year NAICS_NEW_6;
run;



proc sort data=imprates;
by year;
run;

/**************/
/*  Table 1  */
/*************/

proc means data=imprates mean stddev;
 var tvs_imp_rate cm_imp_rate cf_imp_rate ee_imp_rate ph_imp_rate tae_imp_rate ;
 by year; 
title1 "Table 1: Means and Standard Deviations of NAICS-6 Industry Imputation Rates";
title2 "Tabulated, non-AR plants in 2002 and 2007 Censuses of Manufactures"; 
run;


data goodtaetvs imptaetvs goodphtvs impphtvs goodeetvs impeetvs goodcftvs impcftvs goodcmtvs impcmtvs;
 set alldata;
 if tvs_imp=0 then
 do;
  if tae_imp=0 then output goodtaetvs; else output imptaetvs;
  if ph_imp=0 then output goodphtvs; else output impphtvs;
  if ee_imp=0 then output goodeetvs; else output impeetvs;
  if cf_imp=0 then output goodcftvs; else output impcftvs;
  if cm_imp=0 then output goodcmtvs; else output impcmtvs;
 end;
 else
 do;
  output imptaetvs;
  output impphtvs;
  output impeetvs;
  output impcftvs;
  output impcmtvs;
 end;
run;

%MACRO calc_iqr_of_ratio(ratio=,data=);
 proc means data=&data q1 q3 NOPRINT;
 var &ratio;
 by year NAICS_NEW_6;
 output out=&data._iqrs (keep = year NAICS_NEW_6 &ratio._q1 &ratio._q3) q1=&ratio._q1 q3 = &ratio._q3;
 run;
%MEND;


%calc_iqr_of_ratio(ratio=taetvs,data=goodtaetvs);
%calc_iqr_of_ratio(ratio=phtvs,data=goodphtvs);
%calc_iqr_of_ratio(ratio=eetvs,data=goodeetvs);
%calc_iqr_of_ratio(ratio=cftvs,data=goodcftvs);
%calc_iqr_of_ratio(ratio=cmtvs,data=goodcmtvs);

data gooddataratios;
 merge goodtaetvs_iqrs goodphtvs_iqrs goodeetvs_iqrs goodcftvs_iqrs goodcmtvs_iqrs;
by year NAICS_NEW_6;
run;

data gooddataratios (keep = year NAICS_NEW_6 taetvs_iqr_good phtvs_iqr_good eetvs_iqr_good cftvs_iqr_good cmtvs_iqr_good );
 set gooddataratios;
 taetvs_iqr_good = taetvs_q3 - taetvs_q1;
 phtvs_iqr_good = phtvs_q3 - phtvs_q1;
 eetvs_iqr_good = eetvs_q3 - eetvs_q1;
 cftvs_iqr_good = cftvs_q3 - cftvs_q1;
 cmtvs_iqr_good = cmtvs_q3 - cmtvs_q1;
run;

proc sort data=gooddataratios;
by year NAICS_NEW_6;
run;


%calc_iqr_of_ratio(ratio=taetvs,data=imptaetvs);
%calc_iqr_of_ratio(ratio=phtvs,data=impphtvs);
%calc_iqr_of_ratio(ratio=eetvs,data=impeetvs);
%calc_iqr_of_ratio(ratio=cftvs,data=impcftvs);
%calc_iqr_of_ratio(ratio=cmtvs,data=impcmtvs);

data impdataratios;
 merge imptaetvs_iqrs impphtvs_iqrs impeetvs_iqrs impcftvs_iqrs impcmtvs_iqrs;
by year NAICS_NEW_6;
run;


data impdataratios (keep = year NAICS_NEW_6 taetvs_iqr_imp phtvs_iqr_imp eetvs_iqr_imp cftvs_iqr_imp cmtvs_iqr_imp );
 set impdataratios;
 taetvs_iqr_imp = taetvs_q3 - taetvs_q1;
 phtvs_iqr_imp = phtvs_q3 - phtvs_q1;
 eetvs_iqr_imp = eetvs_q3 - eetvs_q1;
 cftvs_iqr_imp = cftvs_q3 - cftvs_q1;
 cmtvs_iqr_imp = cmtvs_q3 - cmtvs_q1;
run;


proc sort data=impdataratios;
by year NAICS_NEW_6;
run;

data goodandimp;
merge impdataratios gooddataratios;
 by year NAICS_NEW_6;
run;

data goodandimp;
set goodandimp;
 taetvs_iqr_imp_good_ratio = taetvs_iqr_imp/taetvs_iqr_good;
 phtvs_iqr_imp_good_ratio = phtvs_iqr_imp/phtvs_iqr_good;
 eetvs_iqr_imp_good_ratio = eetvs_iqr_imp/eetvs_iqr_good;
 cftvs_iqr_imp_good_ratio = cftvs_iqr_imp/cftvs_iqr_good;
 cmtvs_iqr_imp_good_ratio = cmtvs_iqr_imp/cmtvs_iqr_good;
run;

/**************/
/*  Table 2  */
/*************/

proc means data=goodandimp q1 median q3;
 var taetvs_iqr_imp_good_ratio phtvs_iqr_imp_good_ratio eetvs_iqr_imp_good_ratio  cftvs_iqr_imp_good_ratio  cmtvs_iqr_imp_good_ratio ;
by year;
title1 "Table 2: Distribution of NAIC6 industry IQRs of key ratios in imputed data";
title2 "as a fraction of the same ratio in the non-imputed data";
run;



